astronauts <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-07-14/astronauts.csv')
https://github.com/rfordatascience/tidytuesday/tree/master/data/2020/2020-07-14
# Look at the data
glimpse(astronauts)
## Rows: 1,277
## Columns: 24
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14~
## $ number <dbl> 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, ~
## $ nationwide_number <dbl> 1, 2, 1, 1, 2, 2, 2, 4, 4, 3, 3, 3, 4, 4, 5, ~
## $ name <chr> "Gagarin, Yuri", "Titov, Gherman", "Glenn, Jo~
## $ original_name <chr> "<U+0413><U+0410><U+0413><U+0410><U+0420><U+0418><U+041D> <U+042E><U+0440><U+0438><U+0439> <U+0410><U+043B><U+0435><U+043A><U+0441><U+0435><U+0435><U+0432><U+0438><U+0447>", "<U+0422><U+0418><U+0422><U+041E><U+0412> <U+0413><U+0435><U+0440><U+043C><U+0430><U+043D> <U+0421><U+0442><U+0435><U+043F>~
## $ sex <chr> "male", "male", "male", "male", "male", "male~
## $ year_of_birth <dbl> 1934, 1935, 1921, 1921, 1925, 1929, 1929, 193~
## $ nationality <chr> "U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", "~
## $ military_civilian <chr> "military", "military", "military", "military~
## $ selection <chr> "TsPK-1", "TsPK-1", "NASA Astronaut Group 1",~
## $ year_of_selection <dbl> 1960, 1960, 1959, 1959, 1959, 1960, 1960, 196~
## $ mission_number <dbl> 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 3, 1, 2, 1, ~
## $ total_number_of_missions <dbl> 1, 1, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 2, 3, ~
## $ occupation <chr> "pilot", "pilot", "pilot", "PSP", "Pilot", "p~
## $ year_of_mission <dbl> 1961, 1961, 1962, 1998, 1962, 1962, 1970, 196~
## $ mission_title <chr> "Vostok 1", "Vostok 2", "MA-6", "STS-95", "Me~
## $ ascend_shuttle <chr> "Vostok 1", "Vostok 2", "MA-6", "STS-95", "Me~
## $ in_orbit <chr> "Vostok 2", "Vostok 2", "MA-6", "STS-95", "Me~
## $ descend_shuttle <chr> "Vostok 3", "Vostok 2", "MA-6", "STS-95", "Me~
## $ hours_mission <dbl> 1.77, 25.00, 5.00, 213.00, 5.00, 94.00, 424.0~
## $ total_hrs_sum <dbl> 1.77, 25.30, 218.00, 218.00, 5.00, 519.33, 51~
## $ field21 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ eva_hrs_mission <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.0~
## $ total_eva_hrs <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.0~
# We will remove original_name because it contains the Cyrillic letters as well, we don't need that.
# Also we will remove total hours of all missions per astronaut (total_hours_sum), total number of missions, as well as total hours of EVAs, to avoid confusion. We can get those summaries from hours_mission, mission_number, and eva_hrs_mission.
astronauts <- astronauts %>%
dplyr::select(-c(original_name, total_hrs_sum, total_eva_hrs, total_number_of_missions))
# Occupation will need correction
astronauts %>%
distinct(occupation)
## # A tibble: 12 x 1
## occupation
## <chr>
## 1 pilot
## 2 PSP
## 3 Pilot
## 4 commander
## 5 MSP
## 6 flight engineer
## 7 Other (Journalist)
## 8 Flight engineer
## 9 Other (space tourist)
## 10 Other (Space tourist)
## 11 Space tourist
## 12 spaceflight participant
astronauts %>%
distinct(military_civilian)
## # A tibble: 2 x 1
## military_civilian
## <chr>
## 1 military
## 2 civilian
# Correct the coding
astronauts <- astronauts %>%
mutate(
occupation = dplyr::recode(occupation, "Other (space tourist)" = "space_tourist",
"Other (Space tourist)" = "space_tourist",
"Space tourist" = "space_tourist",
"Pilot" = "pilot",
"spaceflight participant" = "spaceflight_participant",
"flight engineer" = "flight_engeneer",
"Flight engineer" = "flight_engeneer",
"Other (Journalist)" = "jounralist")) %>%
rename(eva_instances_per_mission = field21, country = nationality) %>%
mutate_if(is.character, as.factor)
# I could not find what do PSP and MSP mean
# Gender
astronauts %>%
count(sex)
## # A tibble: 2 x 2
## sex n
## <fct> <int>
## 1 female 143
## 2 male 1134
# We have ten time more male than female astronauts
# We can also look at the occupation
astronauts %>%
count(occupation, sort = TRUE)
## # A tibble: 8 x 2
## occupation n
## <fct> <int>
## 1 MSP 498
## 2 commander 315
## 3 pilot 197
## 4 flight_engeneer 196
## 5 PSP 59
## 6 space_tourist 10
## 7 jounralist 1
## 8 spaceflight_participant 1
# By country
astronauts %>%
count(country, sort = TRUE)
## # A tibble: 40 x 2
## country n
## <fct> <int>
## 1 U.S. 854
## 2 U.S.S.R/Russia 273
## 3 Japan 20
## 4 Canada 18
## 5 France 18
## 6 Germany 16
## 7 China 14
## 8 Italy 13
## 9 U.K./U.S. 6
## 10 Australia 4
## # ... with 30 more rows
# Since there are rather few astronauts with nationalities other than US or Soviet / Russian, we will collapse all other groups into 'Other'
astronauts <- astronauts %>%
mutate(country_group = fct_other(country, keep = c("U.S.", "U.S.S.R/Russia")))
astronauts %>%
ggplot(aes(fct_rev(country_group))) +
geom_bar(fill = "steelblue") +
labs(title = "Where do astronauts come from?",
y = "Count", x = NULL) +
coord_flip()
We can see that the US had twice as more astronauts than all other countries combined.
# Plot the top 20 astronauts by time spent in space
astronauts %>%
group_by(name) %>%
summarise(total_hours = sum(hours_mission)) %>%
mutate(days_in_space = total_hours/24) %>%
slice_max(total_hours, n = 20) %>%
ggplot(aes(y = fct_reorder(name, total_hours),
x = days_in_space, color = days_in_space)) +
geom_point(size = 3) +
geom_segment(aes(xend = 100,yend = name),size=2) +
labs(title = "Astronauts with the most days in space",x = "Days spent in space", y = NULL) +
theme(legend.position = "none") +
xlim(100, 900)
# Compare the number of missions through the years
astronauts %>%
group_by(year_of_mission, country_group) %>%
summarise(n = n()) %>%
ggplot(aes(x = year_of_mission, y = n, color = country_group)) +
geom_line(size = 1, alpha = 0.8) +
labs(title = "Number of missions per country by year",
x = "Year of the mission", y = "Number of missions",
color = "Country") +
guides(color = guide_legend(override.aes = list(alpha = 1)))
We can see that Soviet Union / Russia had more missions than the US at some point during the 70s, up to the 1980. Also, they had slightly more missions in past few years. Otherwise, the US had tremendously more missions than anyone else, with five large peaks from the 80s to the end of the 20th century.
Again, we will only look at the most frequent spacecrafts
# Look at the spacecrafts
astronauts %>%
count(in_orbit, country_group) %>%
arrange(desc(n)) %>%
slice_max(n, n = 5)
## # A tibble: 5 x 3
## in_orbit country_group n
## <fct> <fct> <int>
## 1 ISS U.S.S.R/Russia 78
## 2 ISS U.S. 71
## 3 Mir U.S.S.R/Russia 60
## 4 ISS Other 25
## 5 Salyut 6 U.S.S.R/Russia 23
# In orbit spacecraft by country
astronauts %>%
filter(in_orbit %in% c("ISS", "Mir","Salyut 6", "STS-42")) %>%
ggplot(aes(fct_rev(in_orbit), fill = country_group)) +
geom_bar() +
coord_flip() +
labs(title = "Most visited space stations in orbit so far",
y = "Number of visits", x = "Name of the spacecraft",
caption = expression(paste(italic("Note."), " STS-42 was a spacelab module on Discovery shuttle.")))
Here it is interesting that Russians had if not the largest proportion of visits to ISS.
astr_country_hours <- astronauts %>%
group_by(country_group) %>%
summarise(number_of_missions = n(),
total_hours = sum(hours_mission)) %>%
mutate(country_group = fct_reorder(country_group, total_hours),
days_in_space = total_hours/24)
plot_1 <- astr_country_hours %>%
ggplot(aes(x = country_group, y = number_of_missions)) +
geom_col(fill = "steelblue") +
labs(x = "Country", y = "Number of missions")
plot_2 <- astr_country_hours %>%
ggplot(aes(x = country_group, y = days_in_space)) +
geom_col(fill = "steelblue") +
labs(x = "Country", y = "Total dyas in space") +
scale_y_continuous(labels = comma)
p = ggarrange(plot_1, plot_2, ncol = 2)
annotate_figure(p,top = text_grob("Total number of missions and hours in space and by country"))
Interestingly, Russians have more total hours in space, while Americans have greater number of missions. It means that Russian missions are fewer but longer. Country membership (US, Soviet / Russian or Other) could be a predictor of mission duration.
# Military vs civilian astronauts count
bar_1 <- astronauts %>%
ggplot(aes(x = military_civilian, fill = country_group)) +
geom_bar(position = "dodge") +
labs(fill = "Country", x = NULL, y = "Count") +
theme(legend.position = "none")
# Military vs civilian by hours in space
bar_2 <- astronauts %>%
ggplot(aes(x = military_civilian, y = hours_mission, fill = country_group)) +
geom_col(position = "dodge") +
labs(fill = "Country", x = NULL, y = "Hours in space") +
scale_y_continuous(labels = comma) +
theme(legend.position = "none")
p2 = ggarrange(bar_1, bar_2, ncol = 2, common.legend = TRUE, legend = "bottom")
annotate_figure(p2,top = text_grob("Military and civilian personnel in space"))
scatter_1 <- astronauts %>%
filter(eva_hrs_mission > 0) %>%
ggplot(aes(x = year_of_mission, y = eva_hrs_mission, color = country_group)) +
geom_point() +
labs(title = "Length of EVA by year", color = "Country", x = "Year of the mission", y = "Duration of EVA (hrs)",
caption = expression(paste(italic("Note."), " EVA - extravehicular activity"))) +
geom_label(data = subset(astronauts, eva_hrs_mission > 75),
aes(label = ascend_shuttle),
nudge_y = -5,
color = "darkblue") +
theme(legend.position = "none")
scatter_2 <- astronauts %>%
filter(hours_mission > 0) %>%
ggplot(aes(x = year_of_mission, y = hours_mission, color = country_group)) +
geom_point() +
scale_y_log10() +
labs(title = "Length of mission by year", x = "Year of the mission",
y = "Duration of mission in hours (log10)")
ggarrange(scatter_1, scatter_2, ncol = 2, common.legend = TRUE, legend = "bottom")
Length of missions is increased over time. We see that Soyuz TM-26 had an extravehicular activity for over 75 hours. The name of the mission was “24”, but the name of the ascend shuttle was shown to avoid confusion.
We will try to predict the total duration of mission according to extravehicular activity (whether there was one or not), country (US or Russia), and the year of the mission.
We will dichotomize the eva_hrs_mission variable (there are too many missions with no extravehicular activity). We will also take a log10 of hours_mission variable.
# Make a dichotomous variable eva_hrs_mission
astronauts %>%
summarize(eva_hrs_mission) %>%
arrange(eva_hrs_mission)
## # A tibble: 1,277 x 1
## eva_hrs_mission
## <dbl>
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 0
## 9 0
## 10 0
## # ... with 1,267 more rows
astronauts %>%
ggplot(aes(hours_mission)) +
geom_histogram(color = "white", fill = "steelblue") +
labs(title = "Hours mission variable distribution")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Drop the level 'Other' from country_group variable
astronauts_drop <- astronauts %>%
filter(country_group %in% c("U.S.", "U.S.S.R/Russia"),
hours_mission > 10) %>%
droplevels() %>%
mutate(eva_group = ifelse(eva_hrs_mission == 0, "No EVA", "EVA"),
eva_group = as.factor(eva_group),
country_group = relevel(country_group, "U.S."),
hours_mission_log = log10(hours_mission + 0.01))
# Plot hours_mission_log
astronauts_drop %>%
ggplot(aes(hours_mission_log)) +
geom_histogram(color = "white", fill = "steelblue", bins = 15) +
labs(title = "Hours mission variable distribution")
astronauts_drop %>%
count(eva_group)
## # A tibble: 2 x 2
## eva_group n
## <fct> <int>
## 1 EVA 347
## 2 No EVA 764
# Fit the model
model_1 <- lm(hours_mission_log ~ year_of_mission, data = astronauts_drop)
# Look at the summary and coefficients
glance(model_1)
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.256 0.255 0.491 382. 2.92e-73 1 -785. 1576. 1591.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
tidy(model_1, conf.int = TRUE)
## # A tibble: 2 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -43.5 2.36 -18.4 2.06e-66 -48.2 -38.9
## 2 year_of_mission 0.0231 0.00118 19.5 2.92e-73 0.0208 0.0255
The single predictor seems to explain considerable amount of mission hours’ variance. The coefficient is also significant, however, since we logged the outcome variable, the interpretation of the unstandaridzed estimate would not be straightforward. However, the year of the mission is positively related to the log10 of mission duration. This suggest that as time goes by, the space missions tend to be longer.
check_model(model_1)
## Loading required namespace: qqplotr
As for assumptions, the situation does not appear really nice. First, there is a slight deviation from linearity. The residuals do not form a normal distribution around 0. We have 2 peaks. The Q-Q plot also shows this pattern. Finally, the scale location plot shows clear pattern and the reference line is not exactly straight. This indicates heteroskedasticity.
We will add the rest of our predictors to the model.
# Fit model 2
model_2 <- lm(hours_mission_log ~ year_of_mission + country_group + eva_group, data =
astronauts_drop)
# Check the summary and coefficients of the more complex model
glance(model_2)
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.561 0.560 0.377 471. 2.82e-197 3 -492. 994. 1019.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
tidy(model_2, conf.int = TRUE)
## # A tibble: 4 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -40.6 1.90 -21.4 3.22e-85 -44.3 -36.9
## 2 year_of_mission 0.0217 0.000949 22.9 3.02e-95 0.0199 0.0236
## 3 country_groupU.S.S.R~ 0.542 0.0271 20.0 4.05e-76 0.488 0.595
## 4 eva_groupNo EVA -0.378 0.0257 -14.7 8.83e-45 -0.428 -0.327
The model 2 explains 56% of the outcome’s variance, with all predictors being significant. Being a Russian is associated with the increase in log10 of mission hours. This is in line from what we saw in EDA. Absence of extravehicular activities is negatively associated with the mission duration, which is somewhat obvious finding.
check_model(model_2)
The situation with this model appears somewhat better, except for the scale location plot, which clearly shows a pattern and a wiggly reference line. We have no problems with multicollinearity detected. The residuals appear somewhat normal and the reference line in residual plot is not deviating much from straight. However, a strange pattern can also be seen at the residual plot, we have two long groups of points. This is because in the log10 of mission hours distribution, we saw two peaks.
compare_performance(model_1, model_2)
## # Comparison of Model Performance Indices
##
## Name | Model | AIC | BIC | R2 | R2 (adj.) | RMSE | Sigma
## -------------------------------------------------------------------------
## model_1 | lm | 1575.807 | 1590.846 | 0.256 | 0.255 | 0.490 | 0.491
## model_2 | lm | 993.905 | 1018.970 | 0.561 | 0.560 | 0.377 | 0.377
anova(model_1, model_2)
## Analysis of Variance Table
##
## Model 1: hours_mission_log ~ year_of_mission
## Model 2: hours_mission_log ~ year_of_mission + country_group + eva_group
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1109 267.23
## 2 1107 157.71 2 109.52 384.38 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We see that F-change is better in favor of model 2. Remember that the F-statistic for model 1 is 382, and for model 2 is 471. Also, AIC, BIC and RMSE are indicate that the second model is better. Finally, the second model explains much more variance than the initial model.
Here we will check influential cases
# Calculate residuals and influence indices
astronauts_drop <- astronauts_drop %>%
mutate(
zresid = abs(rstandard(model_2)),
sresid = abs(rstudent(model_2)),
cook = cooks.distance(model_2),
leverage = hatvalues(model_2)
)
# Identify high studentized or standardized residuals
astronauts_drop %>%
filter(zresid > 3 | sresid > 3) %>%
dplyr::select(id, zresid, sresid, cook, leverage) %>%
print(n = Inf) # 24, 367, 449
## # A tibble: 3 x 5
## id zresid sresid cook leverage
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 24 3.23 3.25 0.0288 0.0109
## 2 367 3.37 3.39 0.00444 0.00156
## 3 449 3.03 3.04 0.0110 0.00477
# Identify influential cases
# Calculate the cutoff for leverage, 3 * average leverage
hat_cutoff <- (3 + 1)/nrow(astronauts_drop)*3
astronauts_drop %>%
filter(cook > 1 | leverage > hat_cutoff) %>%
dplyr::select(id, zresid, sresid, cook, leverage) %>%
print(n = Inf) # 24, 35, 55, 58, 62
## # A tibble: 5 x 5
## id zresid sresid cook leverage
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 24 3.23 3.25 0.0288 0.0109
## 2 35 0.260 0.260 0.000192 0.0112
## 3 55 0.673 0.673 0.00124 0.0108
## 4 58 0.690 0.689 0.00130 0.0108
## 5 62 0.358 0.358 0.000350 0.0108
We identified the most influential cases and outliers, those are cases: 24, 35, 55, 58, 62, 24, 367, 449; as well as high leverage points from the residuals vs leverage graph: 134, 135, 87, 84, 20.
We can remove them and see how the model will perform
# Remove outliers and influential cases
astronauts_updated <- astronauts_drop %>%
slice(-c(24, 35, 55, 58, 62, 24, 367, 449, 134, 135, 87, 84, 20))
# Fit model 2 again
model_2_updated <- lm(hours_mission_log ~ year_of_mission + country_group + eva_group, data =
astronauts_updated)
# Look at the summary and coefficients
glance(model_2_updated)
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.574 0.572 0.371 491. 4.35e-202 3 -469. 947. 972.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
tidy(model_2_updated, conf.int = TRUE)
## # A tibble: 4 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -40.5 1.91 -21.2 6.74e-84 -44.3 -36.8
## 2 year_of_mission 0.0217 0.000955 22.7 7.88e-94 0.0198 0.0236
## 3 country_groupU.S.S.R~ 0.557 0.0268 20.8 3.90e-81 0.505 0.610
## 4 eva_groupNo EVA -0.380 0.0257 -14.8 2.57e-45 -0.430 -0.330
check_model(model_2_updated)
The plots did not change much. What we can do is to run a robust regression and compare the estimates with the original estimates, since we have too many influential outliers. However, robust regression is not a cure for heteroskedasticity. We will try it anyway.
model_robust <- rlm(hours_mission_log ~ year_of_mission + country_group + eva_group, data =
astronauts_updated)
summary(model_robust)
##
## Call: rlm(formula = hours_mission_log ~ year_of_mission + country_group +
## eva_group, data = astronauts_updated)
## Residuals:
## Min 1Q Median 3Q Max
## -1.033765 -0.208611 0.007168 0.206422 1.295050
##
## Coefficients:
## Value Std. Error t value
## (Intercept) -38.2899 1.7725 -21.6026
## year_of_mission 0.0205 0.0009 23.1790
## country_groupU.S.S.R/Russia 0.6223 0.0249 25.0357
## eva_groupNo EVA -0.3377 0.0238 -14.1817
##
## Residual standard error: 0.306 on 1095 degrees of freedom
The residual standard error i somewhat lower for the robust model, comparing to the updated model 2. On the other hand, unstandaridzed estimates appear very similar.
Therefore, we decided to keep updated model 2.
# Calculate coefficients and add standardized coefficients
model_2_updated_coeff <- tidy(model_2_updated, conf.int = TRUE) %>%
mutate(beta = lm.beta(model_2_updated)$standardized.coefficients) %>%
mutate_if(is.numeric, round, 2) %>%
dplyr::select(term, estimate, beta, everything())
# Summary table
glance(model_2_updated) %>%
round(2) %>%
kbl(caption = "Final model summary") %>%
kable_classic(full_width = FALSE, position = "left")
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.57 | 0.57 | 0.37 | 490.96 | 0 | 3 | -468.62 | 947.25 | 972.26 | 150.97 | 1095 | 1099 |
# Coefficients table
model_2_updated_coeff %>%
kbl(caption = "Final model coefficients") %>%
kable_classic(full_width = FALSE, position = "left")
| term | estimate | beta | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|---|
| (Intercept) | -40.53 | 0.00 | 1.91 | -21.21 | 0 | -44.28 | -36.78 |
| year_of_mission | 0.02 | 0.47 | 0.00 | 22.70 | 0 | 0.02 | 0.02 |
| country_groupU.S.S.R/Russia | 0.56 | 0.42 | 0.03 | 20.78 | 0 | 0.50 | 0.61 |
| eva_groupNo EVA | -0.38 | -0.31 | 0.03 | -14.80 | 0 | -0.43 | -0.33 |
In this project, we analyzed astronauts data from Tidytuesday, provided by NASA, Roskosmos and “fun-made websites”. The dataset was prepared by Georgios Karamanis.
We saw that the US had many more missions than any other country, but Soviet Union / Russia had more hours spent in space in total. Astronaut with the most days spent in space is Gennady Padalka around 800 days, followed by Krikalev Sergei, Kaleri Aleksandr, and so on.
Beside Mir and Salyut 6, Russians spent considerable time on International Space Station. Both Russians and Americans had more military belonging astronauts than civilians.
Regression. We performed hierarchical regression analysis, predicting mission duration by year, in the first model, and then by year, country (US or Russia) and presence of extravehicular activities (Present or Not present). The second model is significantly better, explaining 57% of the mission time. All predictors were significant - year of mission and country (Russia) in positive, and EVA (no Eva) in negative direction. Our finding from EDA that the Russians although have less missions, its astronauts spend more time on missions. Also, missions are getting longer as the time goes. and obviously no extravehicular activities are associated with shorter missions.
This model have serious assumptions violation and the results should be taken with a caution. The distribution of outcome variable was extremely skewed, and after transformation it resembled almost a bimodal distribution. We also performed a robust regression and compared the estimates with the original model. However, that could not solve the clearly heteroskedastic variance. Perhaps, different modeling method should be applied more robust to these kind of assumption violations.
МИР Space station
Source: Wikipedia